In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
from sklearn.impute import KNNImputer
import scipy.stats as ss
import warnings
from scipy.stats import chi2_contingency
import sys
import os

pd.set_option('display.max_columns', 5000)
pd.set_option('display.max_rows', 5000)
In [2]:
sys.path.append(os.path.abspath('../src'))
import f_aux

Cargo todas las librerías que voy a necesitar así como descargo las que no había utilizado hasta ahora (!pip install scikit-learn) y que voy a utilizar en este cuaderno. A continuación añado las funciones que nos han dado los profesores y que se van a utilizar a lo largo de este cuaderno

In [ ]:
def dame_variables_categoricas(dataset=None):
    '''
    ----------------------------------------------------------------------------------------------------------
    Function dame_variables_categoricas:
    ----------------------------------------------------------------------------------------------------------
       -description: Function that receives a dataset and returns a list with the names of the 
        categorical variables.
        -Inputs: 
            -- dataset: Pandas dataframe containing the data.
        -Return:
        -- list_categorical_variables: list with the names of the categorical variables in the 
        input dataset that have fewer than 100 unique values.
        -- 1: execution is incorrect.
    '''
    if dataset is None:
        print(u'\There are no arguments to pass to the function.')
        return 1
    list_categorical_variables = []
    other = []
    for i in dataset.columns:
        if (dataset[i].dtype != float) & (dataset[i].dtype != int):
            unicos = int(len(np.unique(dataset[i].dropna(axis=0, how='all'))))
            if unicos < 100:
                list_categorical_variables.append(i)
            else:
                other.append(i)
    num_categorical = len(list_categorical_variables)
    
    return list_categorical_variables, other, num_categorical

def dame_variables_continuas(dataset=None):
    '''
    ----------------------------------------------------------------------------------------------------------
    Function dame_variables_continuas:
    ----------------------------------------------------------------------------------------------------------
       -description: Function that receives a dataset and returns a list with the names of the 
        continuous variables.
        -Inputs: 
            -- dataset: Pandas dataframe containing the data.
        -Return:
        -- lista_continuous_variables: list with the names of the continuous variables in the 
        input dataset that have fewer than 100 unique values.
        -- 1: execution is incorrect.
    '''
    if dataset is None:
        print(u'\There are no arguments to pass to the function.')
        return 1
    
    list_continuous_variables = []
    other = []
    
    for i in dataset.columns:
        if dataset[i].dtype in [float, int]:
            unicos = dataset[i].nunique()
            if unicos >= 10:
                lista_continuous_variables.append(i)
            else:
                other.append(i)
    
    num_continuas = len(list_continuous_variables)
    
    return list_continuous_variables, other

def cramers_v(confusion_matrix):
    """ 
    calculate Cramers V statistic for categorial-categorial association.
    uses correction from Bergsma and Wicher,
    Journal of the Korean Statistical Society 42 (2013): 323-328
    
    confusion_matrix: tabla creada con pd.crosstab()
    
    """
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

He creado las siguientes funciones para en un futuro utilizarlas en la creación de gráficos divididos por variables categóricas (plot_categorical_feature) y variables continuas (plot_continuous_feature) ya que la función proporcionada en el HTML no terminaba de cargar los gráficos y tras mucho tiempo (hasta 1h 30m corriendo el código) y por consejo de los docentes dividi la creación de gráficos en gráficos para las variables continuas y gráficos para las variables categóricas.

In [ ]:
def plot_categorical_feature(df, col_name, target):
    """
    Visualize a categorical variable with and without faceting on the target variable.
    - df: DataFrame containing the data
    - col_name: Name of the categorical variable to plot
    - target: The target variable for faceting
    """
    f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12,3), dpi=90)
    
    count_null = df[col_name].isnull().sum()
    
    # Category count display
    sns.countplot(x=df[col_name], order=sorted(df[col_name].dropna().unique()), color='#5975A4', saturation=1, ax=ax1)
    ax1.set_xlabel(col_name)
    ax1.set_ylabel('Count')
    ax1.set_title(f'{col_name} - Número de nulos: {count_null}')
    plt.xticks(rotation=90)

    # Visualization of the distribution of the target variable according to the categorical variable.
    data = df.groupby(col_name)[target].value_counts(normalize=True).to_frame('proportion').reset_index()
    data.columns = [col_name, target, 'proportion']
    
    sns.barplot(x=col_name, y='proportion', hue=target, data=data, saturation=1, ax=ax2)
    ax2.set_ylabel(f'{target} fraction')
    ax2.set_title(f'{target} distribution by {col_name}')
    ax2.set_xlabel(col_name)
    plt.xticks(rotation=90)

    plt.tight_layout()
    
def plot_continuous_feature(df, col_name, target):
    """
    Visualize a continuous variable with and without faceting on the target variable.
    - df: DataFrame containing the data
    - col_name: Name of the continuous variable to plot
    - target: The target variable for faceting
    """
    f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12,3), dpi=90)
    
    count_null = df[col_name].isnull().sum()

    # Histogram display
    sns.histplot(df.loc[df[col_name].notnull(), col_name], kde=False, ax=ax1)
    ax1.set_xlabel(col_name)
    ax1.set_ylabel('Count')
    ax1.set_title(f'{col_name} - Número de nulos: {count_null}')

    # Boxplot display for the continuous variable and target variable
    sns.boxplot(x=col_name, y=target, data=df, ax=ax2, hue=target, palette="Set2")  # Hue and palette are added
    ax2.set_ylabel('')
    ax2.set_title(f'{col_name} by {target}')
    ax2.set_xlabel(col_name)
    
    plt.tight_layout()

    """
    Calculates correlations between continuous variables and the target variable.
    
    :param df: DataFrame containing the data.
    :param target: Name of the target column (e.g., 'TARGET').
    :param list_var_cont: List of names of the continuous variables.
    
    :return: DataFrame with the correlations of the continuous variables with the target variable.
    """
    # Create a dictionary to store correlations
    correlations= {}
    
    # We go through each continuous variable in the list.
    for var in list_var_cont:
        # We calculate the Pearson correlation between the variable and the objective.
        correlacion = df[var].corr(df[target])
        # We store the correlation in the dictionary.
        correlations[var] = correlation
    
    # We convert the dictionary into a DataFrame for better visualization.
    correlations_df = pd.DataFrame(list(correlations.items()), columns=['variable', 'correlation'])
    
    # We order the DataFrame from highest to lowest according to the column 'Correlation'.
    correlations_df = correlations_df.sort_values(by='correlation', ascending=False)
    
    return correlations_df

Cargo todas las librerías que voy a necesitar así como descargo las que no había utilizado hasta ahora (!pip install scikit-learn) y que voy a utilizar en este cuaderno. A continuación añado las funciones que nos han dado los profesores y que se van a utilizar a lo largo de este cuaderno

Cargamos el dataset del notebook anterior a través del .csv creado al final de notebook 01. Al igual que el anterior dataframe, se utiliza una ruta relativa para la lectura de dataset en cualquier dispositivo que descargue el .zip. Además, se carga obviando una columna creada al exportarlo llamada '^Unnamed:' y se comprueba que tiene las dimensiones del que habiamos guardado.

In [ ]:
pd_loan = pd.read_csv("../data/pd_data_initial_preprocessing.csv")
pd_loan = pd_loan.loc[:, ~pd_loan.columns.str.contains('^Unnamed:')] #A column called 'Unnamed' had been created, possibly as a result of exporting it to .csv at the end of the previous notebook, so I deleted it.
pd_loan.shape
Out[ ]:
(307511, 122)

Comprobamos que tenemos las mismas columnas/variables:

In [6]:
pd_loan.columns
Out[6]:
Index(['COMMONAREA_MEDI', 'COMMONAREA_AVG', 'COMMONAREA_MODE',
       'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAPARTMENTS_AVG',
       'NONLIVINGAPARTMENTS_MEDI', 'FONDKAPREMONT_MODE',
       'LIVINGAPARTMENTS_MODE', 'LIVINGAPARTMENTS_AVG',
       'LIVINGAPARTMENTS_MEDI',
       ...
       'DAYS_ID_PUBLISH', 'DAYS_REGISTRATION', 'DAYS_EMPLOYED', 'DAYS_BIRTH',
       'REGION_POPULATION_RELATIVE', 'NAME_HOUSING_TYPE', 'NAME_FAMILY_STATUS',
       'NAME_EDUCATION_TYPE', 'NAME_INCOME_TYPE', 'SK_ID_CURR'],
      dtype='object', length=122)

Utilizamos las dos funciones que nos devuelven la categoría de las variables y creamos las listas en este notebook.

In [7]:
list_var_cat, other_cat = dame_variables_categoricas(dataset=pd_loan)
pd_loan[list_var_cat] = pd_loan[list_var_cat].astype("category")
list_var_cont, other_cont = dame_variables_continuas(dataset=pd_loan)
pd_loan.dtypes
Out[7]:
COMMONAREA_MEDI                  float64
COMMONAREA_AVG                   float64
COMMONAREA_MODE                  float64
NONLIVINGAPARTMENTS_MODE         float64
NONLIVINGAPARTMENTS_AVG          float64
NONLIVINGAPARTMENTS_MEDI         float64
FONDKAPREMONT_MODE              category
LIVINGAPARTMENTS_MODE            float64
LIVINGAPARTMENTS_AVG             float64
LIVINGAPARTMENTS_MEDI            float64
FLOORSMIN_AVG                    float64
FLOORSMIN_MODE                   float64
FLOORSMIN_MEDI                   float64
YEARS_BUILD_MEDI                 float64
YEARS_BUILD_MODE                 float64
YEARS_BUILD_AVG                  float64
OWN_CAR_AGE                      float64
LANDAREA_MEDI                    float64
LANDAREA_MODE                    float64
LANDAREA_AVG                     float64
BASEMENTAREA_MEDI                float64
BASEMENTAREA_AVG                 float64
BASEMENTAREA_MODE                float64
EXT_SOURCE_1                     float64
NONLIVINGAREA_MODE               float64
NONLIVINGAREA_AVG                float64
NONLIVINGAREA_MEDI               float64
ELEVATORS_MEDI                   float64
ELEVATORS_AVG                    float64
ELEVATORS_MODE                   float64
WALLSMATERIAL_MODE              category
APARTMENTS_MEDI                  float64
APARTMENTS_AVG                   float64
APARTMENTS_MODE                  float64
ENTRANCES_MEDI                   float64
ENTRANCES_AVG                    float64
ENTRANCES_MODE                   float64
LIVINGAREA_AVG                   float64
LIVINGAREA_MODE                  float64
LIVINGAREA_MEDI                  float64
HOUSETYPE_MODE                  category
FLOORSMAX_MODE                   float64
FLOORSMAX_MEDI                   float64
FLOORSMAX_AVG                    float64
YEARS_BEGINEXPLUATATION_MODE     float64
YEARS_BEGINEXPLUATATION_MEDI     float64
YEARS_BEGINEXPLUATATION_AVG      float64
TOTALAREA_MODE                   float64
EMERGENCYSTATE_MODE             category
OCCUPATION_TYPE                 category
EXT_SOURCE_3                     float64
AMT_REQ_CREDIT_BUREAU_HOUR       float64
AMT_REQ_CREDIT_BUREAU_DAY        float64
AMT_REQ_CREDIT_BUREAU_WEEK       float64
AMT_REQ_CREDIT_BUREAU_MON        float64
AMT_REQ_CREDIT_BUREAU_QRT        float64
AMT_REQ_CREDIT_BUREAU_YEAR       float64
NAME_TYPE_SUITE                 category
OBS_30_CNT_SOCIAL_CIRCLE         float64
DEF_30_CNT_SOCIAL_CIRCLE         float64
OBS_60_CNT_SOCIAL_CIRCLE         float64
DEF_60_CNT_SOCIAL_CIRCLE         float64
EXT_SOURCE_2                     float64
AMT_GOODS_PRICE                  float64
AMT_ANNUITY                      float64
CNT_FAM_MEMBERS                  float64
DAYS_LAST_PHONE_CHANGE           float64
CNT_CHILDREN                    category
FLAG_DOCUMENT_8                 category
NAME_CONTRACT_TYPE              category
CODE_GENDER                     category
FLAG_OWN_CAR                    category
FLAG_DOCUMENT_2                 category
FLAG_DOCUMENT_3                 category
FLAG_DOCUMENT_4                 category
FLAG_DOCUMENT_5                 category
FLAG_DOCUMENT_6                 category
FLAG_DOCUMENT_7                 category
FLAG_DOCUMENT_9                 category
FLAG_DOCUMENT_21                category
FLAG_DOCUMENT_10                category
FLAG_DOCUMENT_11                category
FLAG_OWN_REALTY                 category
FLAG_DOCUMENT_13                category
FLAG_DOCUMENT_14                category
FLAG_DOCUMENT_15                category
FLAG_DOCUMENT_16                category
FLAG_DOCUMENT_17                category
FLAG_DOCUMENT_18                category
FLAG_DOCUMENT_19                category
FLAG_DOCUMENT_20                category
FLAG_DOCUMENT_12                category
AMT_CREDIT                       float64
AMT_INCOME_TOTAL                 float64
FLAG_PHONE                      category
LIVE_CITY_NOT_WORK_CITY         category
REG_CITY_NOT_WORK_CITY          category
TARGET                          category
REG_CITY_NOT_LIVE_CITY          category
LIVE_REGION_NOT_WORK_REGION     category
REG_REGION_NOT_WORK_REGION      category
REG_REGION_NOT_LIVE_REGION      category
HOUR_APPR_PROCESS_START         category
WEEKDAY_APPR_PROCESS_START      category
REGION_RATING_CLIENT_W_CITY     category
REGION_RATING_CLIENT            category
FLAG_EMAIL                      category
FLAG_CONT_MOBILE                category
ORGANIZATION_TYPE               category
FLAG_WORK_PHONE                 category
FLAG_EMP_PHONE                  category
FLAG_MOBIL                      category
DAYS_ID_PUBLISH                    int64
DAYS_REGISTRATION                float64
DAYS_EMPLOYED                      int64
DAYS_BIRTH                         int64
REGION_POPULATION_RELATIVE       float64
NAME_HOUSING_TYPE               category
NAME_FAMILY_STATUS              category
NAME_EDUCATION_TYPE             category
NAME_INCOME_TYPE                category
SK_ID_CURR                         int64
dtype: object

Vemos que variables obtenemos como categóricas

In [8]:
print(list_var_cat)
len(list_var_cat)
['FONDKAPREMONT_MODE', 'WALLSMATERIAL_MODE', 'HOUSETYPE_MODE', 'EMERGENCYSTATE_MODE', 'OCCUPATION_TYPE', 'NAME_TYPE_SUITE', 'CNT_CHILDREN', 'FLAG_DOCUMENT_8', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_21', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_OWN_REALTY', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_12', 'FLAG_PHONE', 'LIVE_CITY_NOT_WORK_CITY', 'REG_CITY_NOT_WORK_CITY', 'TARGET', 'REG_CITY_NOT_LIVE_CITY', 'LIVE_REGION_NOT_WORK_REGION', 'REG_REGION_NOT_WORK_REGION', 'REG_REGION_NOT_LIVE_REGION', 'HOUR_APPR_PROCESS_START', 'WEEKDAY_APPR_PROCESS_START', 'REGION_RATING_CLIENT_W_CITY', 'REGION_RATING_CLIENT', 'FLAG_EMAIL', 'FLAG_CONT_MOBILE', 'ORGANIZATION_TYPE', 'FLAG_WORK_PHONE', 'FLAG_EMP_PHONE', 'FLAG_MOBIL', 'NAME_HOUSING_TYPE', 'NAME_FAMILY_STATUS', 'NAME_EDUCATION_TYPE', 'NAME_INCOME_TYPE']
Out[8]:
53

Vemos que variables obtenemos como continuas

In [9]:
print(list_var_cont)
len(list_var_cont)
['COMMONAREA_MEDI', 'COMMONAREA_AVG', 'COMMONAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAPARTMENTS_MEDI', 'LIVINGAPARTMENTS_MODE', 'LIVINGAPARTMENTS_AVG', 'LIVINGAPARTMENTS_MEDI', 'FLOORSMIN_AVG', 'FLOORSMIN_MODE', 'FLOORSMIN_MEDI', 'YEARS_BUILD_MEDI', 'YEARS_BUILD_MODE', 'YEARS_BUILD_AVG', 'OWN_CAR_AGE', 'LANDAREA_MEDI', 'LANDAREA_MODE', 'LANDAREA_AVG', 'BASEMENTAREA_MEDI', 'BASEMENTAREA_AVG', 'BASEMENTAREA_MODE', 'EXT_SOURCE_1', 'NONLIVINGAREA_MODE', 'NONLIVINGAREA_AVG', 'NONLIVINGAREA_MEDI', 'ELEVATORS_MEDI', 'ELEVATORS_AVG', 'ELEVATORS_MODE', 'APARTMENTS_MEDI', 'APARTMENTS_AVG', 'APARTMENTS_MODE', 'ENTRANCES_MEDI', 'ENTRANCES_AVG', 'ENTRANCES_MODE', 'LIVINGAREA_AVG', 'LIVINGAREA_MODE', 'LIVINGAREA_MEDI', 'FLOORSMAX_MODE', 'FLOORSMAX_MEDI', 'FLOORSMAX_AVG', 'YEARS_BEGINEXPLUATATION_MODE', 'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BEGINEXPLUATATION_AVG', 'TOTALAREA_MODE', 'EXT_SOURCE_3', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', 'EXT_SOURCE_2', 'AMT_GOODS_PRICE', 'AMT_ANNUITY', 'CNT_FAM_MEMBERS', 'DAYS_LAST_PHONE_CHANGE', 'AMT_CREDIT', 'AMT_INCOME_TOTAL', 'DAYS_REGISTRATION', 'REGION_POPULATION_RELATIVE']
Out[9]:
61
In [10]:
print(other_cat+other_cont)
['DAYS_ID_PUBLISH', 'DAYS_EMPLOYED', 'DAYS_BIRTH', 'SK_ID_CURR', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'DEF_60_CNT_SOCIAL_CIRCLE']

En total tenemos 122 variables, 53 categóricas, 61 continuas y 8 'other' al igual que en el dataset del notebook anterior

Graficamos de nuevo la distribución de la variable objetivo entre aquellos que tienen dificultades para pagar (1) y aquellos que no (0)

In [ ]:
pd_plot_loan_status = pd_loan['TARGET']\
        .value_counts(normalize=True)\
        .mul(100).rename('percent').reset_index()

# Calculate TARGET counts
pd_plot_loan_status_count = pd_loan['TARGET'].value_counts().reset_index()

# Make the merge
pd_plot_loan_status_pc = pd.merge(pd_plot_loan_status, 
                                  pd_plot_loan_status_count, 
                                  on='TARGET', how='inner')

# Plot
fig = px.histogram(pd_plot_loan_status_pc, x="TARGET", y="percent")
fig.show()

Realizamos la división del dataset en train, el cual estara conformado por el 80% de los datos, y test, conformado por el 20% restante, además se utiliza la sintaxis "stratify=pd_loan['TARGET']" para que divida proporcionalmente las muestras para obtener el mismo % de 1 y 0 tanto en train como en test

In [12]:
from sklearn.model_selection import train_test_split
x_pd_loan, x_pd_loan_test, y_pd_loan, y_pd_loan_test = train_test_split(pd_loan.drop('TARGET',axis=1), 
                                                                     pd_loan['TARGET'], 
                                                                     stratify=pd_loan['TARGET'], 
                                                                     test_size=0.2)
pd_loan_train = pd.concat([x_pd_loan, y_pd_loan],axis=1)
pd_loan_test = pd.concat([x_pd_loan_test, y_pd_loan_test],axis=1)

Comprobamos que ha funcionado el stratify, y tanto el train como el test tienen la misma proporcion de 0 y 1 (0.919 para los 0 y 0.80 para los 1)

In [13]:
print('== Train\n', pd_loan_train['TARGET'].value_counts(normalize=True))
print('== Test\n', pd_loan_test['TARGET'].value_counts(normalize=True))
== Train
 TARGET
0    0.919271
1    0.080729
Name: proportion, dtype: float64
== Test
 TARGET
0    0.919272
1    0.080728
Name: proportion, dtype: float64
In [14]:
pd_series_null_columns = pd_loan_train.isnull().sum().sort_values(ascending=False)
pd_series_null_rows = pd_loan_train.isnull().sum(axis=1).sort_values(ascending=False)
print(pd_series_null_columns.shape, pd_series_null_rows.shape)
(122,) (246008,)
In [ ]:
pd_null_columns = pd.DataFrame(pd_series_null_columns, columns=['columns_nulls'])     
pd_null_rows = pd.DataFrame(pd_series_null_rows, columns=['row_nulls'])  
pd_null_rows['target'] = pd_loan['TARGET'].copy()
pd_null_columns['columns_percentage'] = pd_null_columns['columns_nulls']/pd_loan_train.shape[0]
pd_null_rows['rows_percentage']= pd_null_rows['row_nulls']/pd_loan_train.shape[1]

El nuevo dataset de 'train' tiene una dimensión de 246008 filas para las 122 variables del dataset y de nuevo, a continuación volvemos a ver el porcentaje de nulos que existe por cada columna del dataframe.

In [ ]:
pd_null_columns
Out[ ]:
nulos_columnas porcentaje_columnas
COMMONAREA_MEDI 171972 0.699050
COMMONAREA_MODE 171972 0.699050
COMMONAREA_AVG 171972 0.699050
NONLIVINGAPARTMENTS_MODE 170909 0.694729
NONLIVINGAPARTMENTS_MEDI 170909 0.694729
NONLIVINGAPARTMENTS_AVG 170909 0.694729
FONDKAPREMONT_MODE 168327 0.684234
LIVINGAPARTMENTS_AVG 168277 0.684031
LIVINGAPARTMENTS_MEDI 168277 0.684031
LIVINGAPARTMENTS_MODE 168277 0.684031
FLOORSMIN_AVG 166993 0.678811
FLOORSMIN_MODE 166993 0.678811
FLOORSMIN_MEDI 166993 0.678811
YEARS_BUILD_MODE 163682 0.665352
YEARS_BUILD_AVG 163682 0.665352
YEARS_BUILD_MEDI 163682 0.665352
OWN_CAR_AGE 162223 0.659422
LANDAREA_MODE 146053 0.593692
LANDAREA_AVG 146053 0.593692
LANDAREA_MEDI 146053 0.593692
BASEMENTAREA_MEDI 143994 0.585322
BASEMENTAREA_AVG 143994 0.585322
BASEMENTAREA_MODE 143994 0.585322
EXT_SOURCE_1 138885 0.564555
NONLIVINGAREA_MODE 135759 0.551848
NONLIVINGAREA_AVG 135759 0.551848
NONLIVINGAREA_MEDI 135759 0.551848
ELEVATORS_AVG 131138 0.533064
ELEVATORS_MODE 131138 0.533064
ELEVATORS_MEDI 131138 0.533064
WALLSMATERIAL_MODE 125093 0.508492
APARTMENTS_MEDI 124856 0.507528
APARTMENTS_AVG 124856 0.507528
APARTMENTS_MODE 124856 0.507528
ENTRANCES_MEDI 123897 0.503630
ENTRANCES_AVG 123897 0.503630
ENTRANCES_MODE 123897 0.503630
LIVINGAREA_AVG 123502 0.502024
LIVINGAREA_MODE 123502 0.502024
LIVINGAREA_MEDI 123502 0.502024
HOUSETYPE_MODE 123479 0.501931
FLOORSMAX_MODE 122449 0.497744
FLOORSMAX_MEDI 122449 0.497744
FLOORSMAX_AVG 122449 0.497744
YEARS_BEGINEXPLUATATION_AVG 120063 0.488045
YEARS_BEGINEXPLUATATION_MODE 120063 0.488045
YEARS_BEGINEXPLUATATION_MEDI 120063 0.488045
TOTALAREA_MODE 118781 0.482834
EMERGENCYSTATE_MODE 116618 0.474041
OCCUPATION_TYPE 77177 0.313717
EXT_SOURCE_3 48916 0.198839
AMT_REQ_CREDIT_BUREAU_HOUR 33287 0.135309
AMT_REQ_CREDIT_BUREAU_DAY 33287 0.135309
AMT_REQ_CREDIT_BUREAU_WEEK 33287 0.135309
AMT_REQ_CREDIT_BUREAU_MON 33287 0.135309
AMT_REQ_CREDIT_BUREAU_QRT 33287 0.135309
AMT_REQ_CREDIT_BUREAU_YEAR 33287 0.135309
NAME_TYPE_SUITE 1035 0.004207
OBS_60_CNT_SOCIAL_CIRCLE 819 0.003329
DEF_60_CNT_SOCIAL_CIRCLE 819 0.003329
DEF_30_CNT_SOCIAL_CIRCLE 819 0.003329
OBS_30_CNT_SOCIAL_CIRCLE 819 0.003329
EXT_SOURCE_2 543 0.002207
AMT_GOODS_PRICE 217 0.000882
AMT_ANNUITY 9 0.000037
CNT_FAM_MEMBERS 2 0.000008
DAYS_LAST_PHONE_CHANGE 1 0.000004
HOUR_APPR_PROCESS_START 0 0.000000
FLAG_EMAIL 0 0.000000
REGION_RATING_CLIENT 0 0.000000
REGION_RATING_CLIENT_W_CITY 0 0.000000
WEEKDAY_APPR_PROCESS_START 0 0.000000
REG_CITY_NOT_LIVE_CITY 0 0.000000
REG_REGION_NOT_LIVE_REGION 0 0.000000
REG_REGION_NOT_WORK_REGION 0 0.000000
LIVE_REGION_NOT_WORK_REGION 0 0.000000
ORGANIZATION_TYPE 0 0.000000
REG_CITY_NOT_WORK_CITY 0 0.000000
FLAG_CONT_MOBILE 0 0.000000
FLAG_MOBIL 0 0.000000
FLAG_WORK_PHONE 0 0.000000
FLAG_EMP_PHONE 0 0.000000
FLAG_PHONE 0 0.000000
DAYS_ID_PUBLISH 0 0.000000
DAYS_REGISTRATION 0 0.000000
DAYS_EMPLOYED 0 0.000000
DAYS_BIRTH 0 0.000000
REGION_POPULATION_RELATIVE 0 0.000000
NAME_HOUSING_TYPE 0 0.000000
NAME_FAMILY_STATUS 0 0.000000
NAME_EDUCATION_TYPE 0 0.000000
NAME_INCOME_TYPE 0 0.000000
SK_ID_CURR 0 0.000000
LIVE_CITY_NOT_WORK_CITY 0 0.000000
FLAG_DOCUMENT_12 0 0.000000
AMT_INCOME_TOTAL 0 0.000000
FLAG_DOCUMENT_9 0 0.000000
CNT_CHILDREN 0 0.000000
FLAG_DOCUMENT_8 0 0.000000
NAME_CONTRACT_TYPE 0 0.000000
CODE_GENDER 0 0.000000
FLAG_OWN_CAR 0 0.000000
FLAG_DOCUMENT_2 0 0.000000
FLAG_DOCUMENT_3 0 0.000000
FLAG_DOCUMENT_4 0 0.000000
FLAG_DOCUMENT_5 0 0.000000
FLAG_DOCUMENT_6 0 0.000000
FLAG_DOCUMENT_7 0 0.000000
FLAG_DOCUMENT_21 0 0.000000
AMT_CREDIT 0 0.000000
FLAG_DOCUMENT_10 0 0.000000
FLAG_DOCUMENT_11 0 0.000000
FLAG_OWN_REALTY 0 0.000000
FLAG_DOCUMENT_13 0 0.000000
FLAG_DOCUMENT_14 0 0.000000
FLAG_DOCUMENT_15 0 0.000000
FLAG_DOCUMENT_16 0 0.000000
FLAG_DOCUMENT_17 0 0.000000
FLAG_DOCUMENT_18 0 0.000000
FLAG_DOCUMENT_19 0 0.000000
FLAG_DOCUMENT_20 0 0.000000
TARGET 0 0.000000
In [ ]:
pd_null_rows.head()
Out[ ]:
nulos_filas target porcentaje_filas
150206 61 0 0.5
185713 61 0 0.5
269492 61 0 0.5
269786 61 0 0.5
116937 61 0 0.5
In [18]:
f_aux.dame_variables_categoricas(pd_loan_train)
Out[18]:
(['FONDKAPREMONT_MODE',
  'WALLSMATERIAL_MODE',
  'HOUSETYPE_MODE',
  'EMERGENCYSTATE_MODE',
  'OCCUPATION_TYPE',
  'NAME_TYPE_SUITE',
  'CNT_CHILDREN',
  'FLAG_DOCUMENT_8',
  'NAME_CONTRACT_TYPE',
  'CODE_GENDER',
  'FLAG_OWN_CAR',
  'FLAG_DOCUMENT_2',
  'FLAG_DOCUMENT_3',
  'FLAG_DOCUMENT_4',
  'FLAG_DOCUMENT_5',
  'FLAG_DOCUMENT_6',
  'FLAG_DOCUMENT_7',
  'FLAG_DOCUMENT_9',
  'FLAG_DOCUMENT_21',
  'FLAG_DOCUMENT_10',
  'FLAG_DOCUMENT_11',
  'FLAG_OWN_REALTY',
  'FLAG_DOCUMENT_13',
  'FLAG_DOCUMENT_14',
  'FLAG_DOCUMENT_15',
  'FLAG_DOCUMENT_16',
  'FLAG_DOCUMENT_17',
  'FLAG_DOCUMENT_18',
  'FLAG_DOCUMENT_19',
  'FLAG_DOCUMENT_20',
  'FLAG_DOCUMENT_12',
  'FLAG_PHONE',
  'LIVE_CITY_NOT_WORK_CITY',
  'REG_CITY_NOT_WORK_CITY',
  'REG_CITY_NOT_LIVE_CITY',
  'LIVE_REGION_NOT_WORK_REGION',
  'REG_REGION_NOT_WORK_REGION',
  'REG_REGION_NOT_LIVE_REGION',
  'HOUR_APPR_PROCESS_START',
  'WEEKDAY_APPR_PROCESS_START',
  'REGION_RATING_CLIENT_W_CITY',
  'REGION_RATING_CLIENT',
  'FLAG_EMAIL',
  'FLAG_CONT_MOBILE',
  'ORGANIZATION_TYPE',
  'FLAG_WORK_PHONE',
  'FLAG_EMP_PHONE',
  'FLAG_MOBIL',
  'NAME_HOUSING_TYPE',
  'NAME_FAMILY_STATUS',
  'NAME_EDUCATION_TYPE',
  'NAME_INCOME_TYPE',
  'TARGET'],
 ['DAYS_ID_PUBLISH', 'DAYS_EMPLOYED', 'DAYS_BIRTH', 'SK_ID_CURR'],
 53)
In [19]:
f_aux.dame_variables_continuas(pd_loan_train)
Out[19]:
(['COMMONAREA_MEDI',
  'COMMONAREA_AVG',
  'COMMONAREA_MODE',
  'NONLIVINGAPARTMENTS_MODE',
  'NONLIVINGAPARTMENTS_AVG',
  'NONLIVINGAPARTMENTS_MEDI',
  'LIVINGAPARTMENTS_MODE',
  'LIVINGAPARTMENTS_AVG',
  'LIVINGAPARTMENTS_MEDI',
  'FLOORSMIN_AVG',
  'FLOORSMIN_MODE',
  'FLOORSMIN_MEDI',
  'YEARS_BUILD_MEDI',
  'YEARS_BUILD_MODE',
  'YEARS_BUILD_AVG',
  'OWN_CAR_AGE',
  'LANDAREA_MEDI',
  'LANDAREA_MODE',
  'LANDAREA_AVG',
  'BASEMENTAREA_MEDI',
  'BASEMENTAREA_AVG',
  'BASEMENTAREA_MODE',
  'EXT_SOURCE_1',
  'NONLIVINGAREA_MODE',
  'NONLIVINGAREA_AVG',
  'NONLIVINGAREA_MEDI',
  'ELEVATORS_MEDI',
  'ELEVATORS_AVG',
  'ELEVATORS_MODE',
  'APARTMENTS_MEDI',
  'APARTMENTS_AVG',
  'APARTMENTS_MODE',
  'ENTRANCES_MEDI',
  'ENTRANCES_AVG',
  'ENTRANCES_MODE',
  'LIVINGAREA_AVG',
  'LIVINGAREA_MODE',
  'LIVINGAREA_MEDI',
  'FLOORSMAX_MODE',
  'FLOORSMAX_MEDI',
  'FLOORSMAX_AVG',
  'YEARS_BEGINEXPLUATATION_MODE',
  'YEARS_BEGINEXPLUATATION_MEDI',
  'YEARS_BEGINEXPLUATATION_AVG',
  'TOTALAREA_MODE',
  'EXT_SOURCE_3',
  'AMT_REQ_CREDIT_BUREAU_MON',
  'AMT_REQ_CREDIT_BUREAU_QRT',
  'AMT_REQ_CREDIT_BUREAU_YEAR',
  'OBS_30_CNT_SOCIAL_CIRCLE',
  'OBS_60_CNT_SOCIAL_CIRCLE',
  'EXT_SOURCE_2',
  'AMT_GOODS_PRICE',
  'AMT_ANNUITY',
  'CNT_FAM_MEMBERS',
  'DAYS_LAST_PHONE_CHANGE',
  'AMT_CREDIT',
  'AMT_INCOME_TOTAL',
  'DAYS_REGISTRATION',
  'REGION_POPULATION_RELATIVE'],
 ['AMT_REQ_CREDIT_BUREAU_HOUR',
  'AMT_REQ_CREDIT_BUREAU_DAY',
  'AMT_REQ_CREDIT_BUREAU_WEEK',
  'DEF_30_CNT_SOCIAL_CIRCLE',
  'DEF_60_CNT_SOCIAL_CIRCLE'],
 60)

A continuación se realizan los gráficos para comparar las variables continuas con la variable objetivo 'TARGET'

In [ ]:
# Suprimir advertencias
warnings.filterwarnings('ignore')

# Traverse all columns of train and make graphs only for continuous variables
for i in list(pd_loan_train.columns):
    # Check if the column is continuous (dtype float or int)
    if (pd_loan_train[i].dtype in ['float64', 'int64']) & (i != 'TARGET'):
        # Call function for continuous variables
        plot_continuous_feature(pd_loan_train, col_name=i, target='TARGET')
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

A continuación se realizan los gráficos para comparar las variables categóricas con la variable objetivo 'TARGET'

In [ ]:
warnings.filterwarnings('ignore')
for i in list(pd_loan_train.columns):
    # Checks if the column is not continuous and is not 'TARGET'.
    if (pd_loan_train[i].dtype in ['object', 'category']) & (i != 'TARGET'):
        # Call function for categorical variables
        plot_categorical_feature(pd_loan_train, col_name=i, target='TARGET')
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

En cuanto a los gráficos podemos sacar y siendo que: Comentarios de gráficos EDA: 1- CLIENTE CON DIFICULTADES 0- CLIENTE SIN DIFICULTADES

--------------VARIABLES CONTINUAS--------------

-VARIABLE: AMT_REQ_CREDIT_BUREAU: Vemos una diferencia significante entre el número de consultas a la oficina de crédito ('CREDIT_BUREAU') sobre el cliente un día al año (excluidos los 3 últimos meses antes de la solicitud) entre los que han tenido dificultades y los que no. Se puede observar como aquellos que han tenido mayor dificultad para pagar su respectivo crédito son más probables de ser objetivo de consulta a la oficina de crédito

-VARIABLE: DAYS_BIRTH: En cuanto a la edad, parece ser también un indicativo de dificultades para el pago siendo que los clientes más jóvenes tienen más probabilidad de tener dificultades frente al pago del crédito. Vemos como la mediana de clientes con dificultades es de aproximadamente 38 años mientras que la de clientes que no tuvieron dificultades es mayor a los 43 años.

-VARIABLE: OWN_CAR_AGE: Esta variable nos indica la edad que tiene el coche del cliente, en este caso vemos como la mediana de edad del coche del cliente para los que tuvieron dificultades de pago es mayor, podriamos decir que tener un coche más antiguo puede ser significante a la hora de tener dificultades con los pagos, esto se puede deber a que los que no han tenido dificultades puedan permitirse coches más nuevos. La mediana para los que no tuvieron dificultades se encuentra aproximadamente en los 9 años de edad mientras que la de los que si tuvieron se aproxima más a 11 años.

-VARIABLES: EXT_SOURCE_1, EXT_SOURCE_2, EXT_SOURCE_3: Estas tres variables que aparentemente recojen "una puntuación o score normalizada de una fuente de datos externa" aparentemente la mediana para los que tuvieron dificultades es bastante inferior (casi 0.2 puntos sobre 1 en los 3 casos) por lo que es probable que en tuvieran scores o puntuaciones más bajas en otras fuentes de datos y eso afecte a la variable 'TARGET'.

-VARIABLE: DAYS_LAST_PHONE_CHANGE: Aparentemente aquellos que cambiaron de móvil más recientemente han tenido más dificultades para realizar los pagos, entrando en suposiciones, esto puede deberse diferentes razones, como se me ocurre que sean inmigrantes de otros países que no tienen tantas oportunidades y que necesiten un nuevo móvil para solicitar el crédito entre otras.

-VARIABLE: DAYS_ID_PUBLISH: Sucede algo similar a la variable del telefono móvil. Cuanto menos tiempo haya pasado desde la última actualización de su documento de identidad. Puede también estar relacionado con alguien que ha obtenido una nueva visa y busca un crédito para empezar en otro país o similares.

-VARIABLE: DAYS_REGISTRATION: Pese a que aparentemente la mediana no difiere tanto, se debe a los outliers que alteran la escala del gráfico y las variables van de 5000 en 5000, pero la distancia es similar en proporción a la de anteriores variables. Esta variable recoge hace cuantos días el cliente cambio su registro, aparentemente los que tienen más dificultades lo cambiaron más recientemente.

--------------VARIABLES CATEGÓRICAS--------------

-VARIABLE OCCUPATION_TYPE: Dentro del trabajo que realizaban los tomadores del crédito observamos como hay algunas profesiones que tienen significantemente más dificultades para realizar el pago, estas son: -Low Skills Laborers -Waiters/barmen staff -Drivers En conclusión, los trabajadores con menos cualificación tienen más probabilidad de tener dificultades al realizar los pagos, en cambio los trabajadores más cualificados, en proporción, tienden a tener menos problemas, como observamso en empleos como: -High Skill Tech Staff -Accountants -Medicine Staff

VARIABLE NAME_CONTRACT_TYPE: El tipo de contrato parece que afecta levemente a las dificultades a la hora de pagar, los “Cash Loans” o créditos tradicionales parecen generar más dificultades, esto podría deberse a la cuota fija mensual y que no siempre los tomadores del crédito lleguen a ella, el carácter mixto del “Revolving Loan” hace que si un mes un cliente vaya más apurado y su gasto sea menor sea más difícil de devolver al ser una cantidad menor.

VARIABLE: FLAG_DOCUMENT_2: Suponiendo que 0 es entregado y 1 no entregado, podemos ver que dentro de aquellos que no lo entregaron, una gran parte han tenido dificultades para pagar el préstamo por lo que sería interesante pedir antes de aprobar un préstamo este documento ya que aparentemente influye en la capacidad de luego devolver los pagos

VARIABLE: WALLSMATERIAL_MODE: Esta variable recoge de que estan hechas las paredes de la vivienda del cliente, pareces ser que aquellas viviendas hechas de materiales más frágiles, vease madera o mixtos (intuyendo como mixtos una mezcla de materiales de bajo coste) tiene mayores dificultades a la hora de realizar los pagos.

VARIABLE: NAME_HOUSING_TYPE: Esta variable recoge donde o con quien convive el tomador del préstamo, la gran mayoría vive en una casa/apartamento, pero si es verdad que, de los que viven en un apartamento alquilado o con sus padres muestran más dificultad a la hora de realizar los pagos, esto se puede deber a que parte del dinero se va en el alquiler o en el caso de los que viven con sus padres que son estudiantes o no se pueden permitir ni si quiera un alquiler.

VARIABLE: NAME_INCOME_TYPE: En cuanto a la variable que nos dice de donde provienen sus ingresos, la gran mayoría de sujetos se encuentran trabajando, son asociados, pensionistas o trabajadores del estado, aquellos que parecen tener más dificultad son los desempleados, cosa que se podía preveer, y las bajas de maternidad al ver disminuidos los ingresos percibidos por la baja,

In [22]:
f_aux.get_deviation_of_mean_perc(pd_loan_train, list_var_cont, target='TARGET', multiplier=3)
Out[22]:
0.0 1.0 variable sum_outlier_values porcentaje_sum_null_values
0 0.000000 1.000000 COMMONAREA_MEDI 1371 0.005573
1 0.944566 0.055434 COMMONAREA_MEDI 1371 0.005573
2 0.000000 1.000000 COMMONAREA_AVG 1359 0.005524
3 0.944812 0.055188 COMMONAREA_AVG 1359 0.005524
4 0.000000 1.000000 COMMONAREA_MODE 1352 0.005496
5 0.943047 0.056953 COMMONAREA_MODE 1352 0.005496
6 0.000000 1.000000 NONLIVINGAPARTMENTS_MODE 533 0.002167
7 0.930582 0.069418 NONLIVINGAPARTMENTS_MODE 533 0.002167
8 0.000000 1.000000 NONLIVINGAPARTMENTS_AVG 567 0.002305
9 0.938272 0.061728 NONLIVINGAPARTMENTS_AVG 567 0.002305
10 0.000000 1.000000 NONLIVINGAPARTMENTS_MEDI 569 0.002313
11 0.936731 0.063269 NONLIVINGAPARTMENTS_MEDI 569 0.002313
12 0.000000 1.000000 LIVINGAPARTMENTS_MODE 1441 0.005858
13 0.943095 0.056905 LIVINGAPARTMENTS_MODE 1441 0.005858
14 0.000000 1.000000 LIVINGAPARTMENTS_AVG 1395 0.005671
15 0.946953 0.053047 LIVINGAPARTMENTS_AVG 1395 0.005671
16 0.000000 1.000000 LIVINGAPARTMENTS_MEDI 1405 0.005711
17 0.945907 0.054093 LIVINGAPARTMENTS_MEDI 1405 0.005711
18 0.000000 1.000000 FLOORSMIN_AVG 475 0.001931
19 0.960000 0.040000 FLOORSMIN_AVG 475 0.001931
20 0.000000 1.000000 FLOORSMIN_MODE 388 0.001577
21 0.958763 0.041237 FLOORSMIN_MODE 388 0.001577
22 0.000000 1.000000 FLOORSMIN_MEDI 448 0.001821
23 0.959821 0.040179 FLOORSMIN_MEDI 448 0.001821
24 0.000000 1.000000 YEARS_BUILD_MEDI 959 0.003898
25 0.921794 0.078206 YEARS_BUILD_MEDI 959 0.003898
26 0.000000 1.000000 YEARS_BUILD_MODE 957 0.003890
27 0.920585 0.079415 YEARS_BUILD_MODE 957 0.003890
28 0.000000 1.000000 YEARS_BUILD_AVG 947 0.003849
29 0.920803 0.079197 YEARS_BUILD_AVG 947 0.003849
30 0.000000 1.000000 OWN_CAR_AGE 2686 0.010918
31 0.918466 0.081534 OWN_CAR_AGE 2686 0.010918
32 0.000000 1.000000 LANDAREA_MEDI 1733 0.007044
33 0.937680 0.062320 LANDAREA_MEDI 1733 0.007044
34 0.000000 1.000000 LANDAREA_MODE 1740 0.007073
35 0.935057 0.064943 LANDAREA_MODE 1740 0.007073
36 0.000000 1.000000 LANDAREA_AVG 1713 0.006963
37 0.936953 0.063047 LANDAREA_AVG 1713 0.006963
38 0.000000 1.000000 BASEMENTAREA_MEDI 1596 0.006488
39 0.949875 0.050125 BASEMENTAREA_MEDI 1596 0.006488
40 0.000000 1.000000 BASEMENTAREA_AVG 1575 0.006402
41 0.949841 0.050159 BASEMENTAREA_AVG 1575 0.006402
42 0.000000 1.000000 BASEMENTAREA_MODE 1648 0.006699
43 0.949029 0.050971 BASEMENTAREA_MODE 1648 0.006699
44 0.000000 1.000000 NONLIVINGAREA_MODE 1992 0.008097
45 0.950301 0.049699 NONLIVINGAREA_MODE 1992 0.008097
46 0.000000 1.000000 NONLIVINGAREA_AVG 1965 0.007988
47 0.949109 0.050891 NONLIVINGAREA_AVG 1965 0.007988
48 0.000000 1.000000 NONLIVINGAREA_MEDI 1974 0.008024
49 0.949848 0.050152 NONLIVINGAREA_MEDI 1974 0.008024
50 0.000000 1.000000 ELEVATORS_MEDI 1915 0.007784
51 0.953525 0.046475 ELEVATORS_MEDI 1915 0.007784
52 0.000000 1.000000 ELEVATORS_AVG 1924 0.007821
53 0.953742 0.046258 ELEVATORS_AVG 1924 0.007821
54 0.000000 1.000000 ELEVATORS_MODE 2677 0.010882
55 0.948450 0.051550 ELEVATORS_MODE 2677 0.010882
56 0.000000 1.000000 APARTMENTS_MEDI 2447 0.009947
57 0.948100 0.051900 APARTMENTS_MEDI 2447 0.009947
58 0.000000 1.000000 APARTMENTS_AVG 2398 0.009748
59 0.948707 0.051293 APARTMENTS_AVG 2398 0.009748
60 0.000000 1.000000 APARTMENTS_MODE 2404 0.009772
61 0.948003 0.051997 APARTMENTS_MODE 2404 0.009772
62 0.000000 1.000000 ENTRANCES_MEDI 1758 0.007146
63 0.937429 0.062571 ENTRANCES_MEDI 1758 0.007146
64 0.000000 1.000000 ENTRANCES_AVG 1747 0.007101
65 0.938180 0.061820 ENTRANCES_AVG 1747 0.007101
66 0.000000 1.000000 ENTRANCES_MODE 2072 0.008422
67 0.940154 0.059846 ENTRANCES_MODE 2072 0.008422
68 0.000000 1.000000 LIVINGAREA_AVG 2576 0.010471
69 0.947981 0.052019 LIVINGAREA_AVG 2576 0.010471
70 0.000000 1.000000 LIVINGAREA_MODE 2699 0.010971
71 0.945165 0.054835 LIVINGAREA_MODE 2699 0.010971
72 0.000000 1.000000 LIVINGAREA_MEDI 2587 0.010516
73 0.948203 0.051797 LIVINGAREA_MEDI 2587 0.010516
74 0.000000 1.000000 FLOORSMAX_MODE 2127 0.008646
75 0.962388 0.037612 FLOORSMAX_MODE 2127 0.008646
76 0.000000 1.000000 FLOORSMAX_MEDI 2201 0.008947
77 0.960927 0.039073 FLOORSMAX_MEDI 2201 0.008947
78 0.000000 1.000000 FLOORSMAX_AVG 2102 0.008544
79 0.961465 0.038535 FLOORSMAX_AVG 2102 0.008544
80 0.000000 1.000000 YEARS_BEGINEXPLUATATION_MODE 552 0.002244
81 0.911232 0.088768 YEARS_BEGINEXPLUATATION_MODE 552 0.002244
82 0.000000 1.000000 YEARS_BEGINEXPLUATATION_MEDI 527 0.002142
83 0.908918 0.091082 YEARS_BEGINEXPLUATATION_MEDI 527 0.002142
84 0.000000 1.000000 YEARS_BEGINEXPLUATATION_AVG 561 0.002280
85 0.912656 0.087344 YEARS_BEGINEXPLUATATION_AVG 561 0.002280
86 0.000000 1.000000 TOTALAREA_MODE 2674 0.010870
87 0.953628 0.046372 TOTALAREA_MODE 2674 0.010870
88 0.000000 1.000000 AMT_REQ_CREDIT_BUREAU_MON 2609 0.010605
89 0.945573 0.054427 AMT_REQ_CREDIT_BUREAU_MON 2609 0.010605
90 0.000000 1.000000 AMT_REQ_CREDIT_BUREAU_QRT 1857 0.007549
91 0.920840 0.079160 AMT_REQ_CREDIT_BUREAU_QRT 1857 0.007549
92 0.000000 1.000000 AMT_REQ_CREDIT_BUREAU_YEAR 2715 0.011036
93 0.906077 0.093923 AMT_REQ_CREDIT_BUREAU_YEAR 2715 0.011036
94 0.000000 1.000000 OBS_30_CNT_SOCIAL_CIRCLE 4934 0.020056
95 0.910620 0.089380 OBS_30_CNT_SOCIAL_CIRCLE 4934 0.020056
96 0.000000 1.000000 DEF_30_CNT_SOCIAL_CIRCLE 5460 0.022194
97 0.881319 0.118681 DEF_30_CNT_SOCIAL_CIRCLE 5460 0.022194
98 0.000000 1.000000 OBS_60_CNT_SOCIAL_CIRCLE 4796 0.019495
99 0.910342 0.089658 OBS_60_CNT_SOCIAL_CIRCLE 4796 0.019495
100 0.000000 1.000000 AMT_GOODS_PRICE 3304 0.013430
101 0.961864 0.038136 AMT_GOODS_PRICE 3304 0.013430
102 0.000000 1.000000 AMT_ANNUITY 2352 0.009561
103 0.963010 0.036990 AMT_ANNUITY 2352 0.009561
104 0.000000 1.000000 CNT_FAM_MEMBERS 3240 0.013170
105 0.901235 0.098765 CNT_FAM_MEMBERS 3240 0.013170
106 0.000000 1.000000 DAYS_LAST_PHONE_CHANGE 510 0.002073
107 0.956863 0.043137 DAYS_LAST_PHONE_CHANGE 510 0.002073
108 0.000000 1.000000 AMT_CREDIT 2588 0.010520
109 0.959042 0.040958 AMT_CREDIT 2588 0.010520
110 0.000000 1.000000 AMT_INCOME_TOTAL 224 0.000911
111 0.937500 0.062500 AMT_INCOME_TOTAL 224 0.000911
112 0.000000 1.000000 DAYS_REGISTRATION 605 0.002459
113 0.957025 0.042975 DAYS_REGISTRATION 605 0.002459
114 0.000000 1.000000 REGION_POPULATION_RELATIVE 6725 0.027337
115 0.959554 0.040446 REGION_POPULATION_RELATIVE 6725 0.027337

El código anterior y su respuesta nos devuelve el porcentaje de valores que se encuentran fuera de un intervalo de confianza definido en torno a la media de cada variable continua en list_var_cont, y devuelve un dataframe que describe la proporción de estos valores outliers. Además, examina cómo estos valores se distribuyen en función de una columna de 'TARGET' en el dataframe. Teoricamente la distribución de las columnas 0 y 1 de la tabla anterior debería seguir la misma distribución que la de train y test (0 = 0.92, 1= 0,08), aquellas variables en las que cambie drasticamente significaría que dicha columna tiene más o menos porcentaje de 0 o de 1. Pongamos por ejemplo la variable 'FLOORSMIN_AVG' donde la los outlier tienen un mayor porcentaje, es decir se encuentran más, en 'no dificultades para el pago' (pasa de 0.92 a 0.96 aprox)

In [23]:
f_aux.get_corr_matrix(dataset = pd_loan_train[list_var_cont], 
                metodo='pearson', size_figure=[10,8])
No description has been provided for this image
Out[23]:
0

El anterior código nos muestra las correlaciones que existen entre las variables. Al haber tantas variables es muy dificil de analizar e interpretar, no obstante podemos hacerlo a través del siguiente código con el cual podemos ver cuales son las variables con más relación entre sí, se escogen aquellas cuya correlación sea >70% ya que el conjunto de variables (columnas) es muy amplio. Vemos como las principales correlaciones son las mismas variables pero expresadas de otra forma como nos dice el diccionario de variables

In [24]:
corr = pd_loan_train[list_var_cont].corr('pearson')
new_corr = corr.abs()
new_corr.loc[:,:] = np.tril(new_corr, k=-1) # below main lower triangle of an array
new_corr = new_corr.stack().to_frame('correlation').reset_index().sort_values(by='correlation', ascending=False)
new_corr[new_corr['correlation'] > 0.7]
Out[24]:
level_0 level_1 correlation
3160 OBS_60_CNT_SOCIAL_CIRCLE OBS_30_CNT_SOCIAL_CIRCLE 0.998511
866 YEARS_BUILD_AVG YEARS_BUILD_MEDI 0.998477
680 FLOORSMIN_MEDI FLOORSMIN_AVG 0.997245
2479 FLOORSMAX_AVG FLOORSMAX_MEDI 0.997021
2045 ENTRANCES_AVG ENTRANCES_MEDI 0.996918
61 COMMONAREA_AVG COMMONAREA_MEDI 0.996382
1673 ELEVATORS_AVG ELEVATORS_MEDI 0.996020
2292 LIVINGAREA_MEDI LIVINGAREA_AVG 0.995843
495 LIVINGAPARTMENTS_MEDI LIVINGAPARTMENTS_AVG 0.995328
1859 APARTMENTS_AVG APARTMENTS_MEDI 0.995124
2665 YEARS_BEGINEXPLUATATION_AVG YEARS_BEGINEXPLUATATION_MEDI 0.993844
1239 BASEMENTAREA_AVG BASEMENTAREA_MEDI 0.993824
309 NONLIVINGAPARTMENTS_MEDI NONLIVINGAPARTMENTS_AVG 0.991780
1114 LANDAREA_AVG LANDAREA_MEDI 0.990134
1549 NONLIVINGAREA_MEDI NONLIVINGAREA_AVG 0.989556
867 YEARS_BUILD_AVG YEARS_BUILD_MODE 0.989399
805 YEARS_BUILD_MODE YEARS_BUILD_MEDI 0.989382
681 FLOORSMIN_MEDI FLOORSMIN_MODE 0.988760
2417 FLOORSMAX_MEDI FLOORSMAX_MODE 0.988430
3530 AMT_CREDIT AMT_GOODS_PRICE 0.986912
619 FLOORSMIN_MODE FLOORSMIN_AVG 0.986325
2478 FLOORSMAX_AVG FLOORSMAX_MODE 0.985861
1734 ELEVATORS_MODE ELEVATORS_MEDI 0.982601
1053 LANDAREA_MODE LANDAREA_MEDI 0.980747
2106 ENTRANCES_MODE ENTRANCES_MEDI 0.980217
122 COMMONAREA_MODE COMMONAREA_MEDI 0.979969
1735 ELEVATORS_MODE ELEVATORS_AVG 0.978467
1300 BASEMENTAREA_MODE BASEMENTAREA_MEDI 0.978067
123 COMMONAREA_MODE COMMONAREA_AVG 0.977409
2107 ENTRANCES_MODE ENTRANCES_AVG 0.977365
1920 APARTMENTS_MODE APARTMENTS_MEDI 0.976742
308 NONLIVINGAPARTMENTS_MEDI NONLIVINGAPARTMENTS_MODE 0.974976
2293 LIVINGAREA_MEDI LIVINGAREA_MODE 0.974665
1548 NONLIVINGAREA_MEDI NONLIVINGAREA_MODE 0.974584
494 LIVINGAPARTMENTS_MEDI LIVINGAPARTMENTS_MODE 0.974430
1301 BASEMENTAREA_MODE BASEMENTAREA_AVG 0.973156
1921 APARTMENTS_MODE APARTMENTS_AVG 0.972762
2664 YEARS_BEGINEXPLUATATION_AVG YEARS_BEGINEXPLUATATION_MODE 0.972237
2231 LIVINGAREA_MODE LIVINGAREA_AVG 0.972191
1115 LANDAREA_AVG LANDAREA_MODE 0.971778
433 LIVINGAPARTMENTS_AVG LIVINGAPARTMENTS_MODE 0.970417
247 NONLIVINGAPARTMENTS_AVG NONLIVINGAPARTMENTS_MODE 0.966745
1487 NONLIVINGAREA_AVG NONLIVINGAREA_MODE 0.963975
2603 YEARS_BEGINEXPLUATATION_MEDI YEARS_BEGINEXPLUATATION_MODE 0.962688
1837 APARTMENTS_AVG LIVINGAPARTMENTS_AVG 0.943765
1838 APARTMENTS_AVG LIVINGAPARTMENTS_MEDI 0.941273
1777 APARTMENTS_MEDI LIVINGAPARTMENTS_MEDI 0.941149
1897 APARTMENTS_MODE LIVINGAPARTMENTS_MODE 0.936658
1776 APARTMENTS_MEDI LIVINGAPARTMENTS_AVG 0.935679
1775 APARTMENTS_MEDI LIVINGAPARTMENTS_MODE 0.931484
1836 APARTMENTS_AVG LIVINGAPARTMENTS_MODE 0.930197
2719 TOTALAREA_MODE LIVINGAREA_AVG 0.923478
2721 TOTALAREA_MODE LIVINGAREA_MEDI 0.917987
2286 LIVINGAREA_MEDI APARTMENTS_MEDI 0.915855
2165 LIVINGAREA_AVG APARTMENTS_AVG 0.913874
2164 LIVINGAREA_AVG APARTMENTS_MEDI 0.912630
1899 APARTMENTS_MODE LIVINGAPARTMENTS_MEDI 0.912536
2287 LIVINGAREA_MEDI APARTMENTS_AVG 0.912312
2227 LIVINGAREA_MODE APARTMENTS_MODE 0.909923
1898 APARTMENTS_MODE LIVINGAPARTMENTS_AVG 0.907557
2720 TOTALAREA_MODE LIVINGAREA_MODE 0.897088
2225 LIVINGAREA_MODE APARTMENTS_MEDI 0.895602
2288 LIVINGAREA_MEDI APARTMENTS_MODE 0.893889
2226 LIVINGAREA_MODE APARTMENTS_AVG 0.892967
2714 TOTALAREA_MODE APARTMENTS_AVG 0.890927
2166 LIVINGAREA_AVG APARTMENTS_MODE 0.890588
2713 TOTALAREA_MODE APARTMENTS_MEDI 0.884826
2265 LIVINGAREA_MEDI LIVINGAPARTMENTS_MEDI 0.883056
2143 LIVINGAREA_AVG LIVINGAPARTMENTS_MEDI 0.881635
2142 LIVINGAREA_AVG LIVINGAPARTMENTS_AVG 0.880556
2264 LIVINGAREA_MEDI LIVINGAPARTMENTS_AVG 0.877931
2202 LIVINGAREA_MODE LIVINGAPARTMENTS_MODE 0.877637
2263 LIVINGAREA_MEDI LIVINGAPARTMENTS_MODE 0.873259
2141 LIVINGAREA_AVG LIVINGAPARTMENTS_MODE 0.871946
2283 LIVINGAREA_MEDI ELEVATORS_MEDI 0.867918
2162 LIVINGAREA_AVG ELEVATORS_AVG 0.867434
2161 LIVINGAREA_AVG ELEVATORS_MEDI 0.865518
2284 LIVINGAREA_MEDI ELEVATORS_AVG 0.865096
2715 TOTALAREA_MODE APARTMENTS_MODE 0.861435
2204 LIVINGAREA_MODE LIVINGAPARTMENTS_MEDI 0.855399
2224 LIVINGAREA_MODE ELEVATORS_MODE 0.855391
2285 LIVINGAREA_MEDI ELEVATORS_MODE 0.855272
2163 LIVINGAREA_AVG ELEVATORS_MODE 0.852140
2203 LIVINGAREA_MODE LIVINGAPARTMENTS_AVG 0.851050
2691 TOTALAREA_MODE LIVINGAPARTMENTS_AVG 0.847094
2711 TOTALAREA_MODE ELEVATORS_AVG 0.844348
2692 TOTALAREA_MODE LIVINGAPARTMENTS_MEDI 0.843978
2222 LIVINGAREA_MODE ELEVATORS_MEDI 0.840442
2223 LIVINGAREA_MODE ELEVATORS_AVG 0.838194
2710 TOTALAREA_MODE ELEVATORS_MEDI 0.837895
1795 APARTMENTS_MEDI ELEVATORS_MEDI 0.837471
1857 APARTMENTS_AVG ELEVATORS_AVG 0.837014
1856 APARTMENTS_AVG ELEVATORS_MEDI 0.835223
1796 APARTMENTS_MEDI ELEVATORS_AVG 0.834493
2690 TOTALAREA_MODE LIVINGAPARTMENTS_MODE 0.832259
1919 APARTMENTS_MODE ELEVATORS_MODE 0.826284
1797 APARTMENTS_MEDI ELEVATORS_MODE 0.825548
1858 APARTMENTS_AVG ELEVATORS_MODE 0.822450
2712 TOTALAREA_MODE ELEVATORS_MODE 0.820235
1594 ELEVATORS_MEDI LIVINGAPARTMENTS_MEDI 0.812912
1654 ELEVATORS_AVG LIVINGAPARTMENTS_AVG 0.811501
1655 ELEVATORS_AVG LIVINGAPARTMENTS_MEDI 0.811312
1593 ELEVATORS_MEDI LIVINGAPARTMENTS_AVG 0.809061
1917 APARTMENTS_MODE ELEVATORS_MEDI 0.808644
1714 ELEVATORS_MODE LIVINGAPARTMENTS_MODE 0.806989
1918 APARTMENTS_MODE ELEVATORS_AVG 0.805672
1716 ELEVATORS_MODE LIVINGAPARTMENTS_MEDI 0.798237
1592 ELEVATORS_MEDI LIVINGAPARTMENTS_MODE 0.798123
1653 ELEVATORS_AVG LIVINGAPARTMENTS_MODE 0.796344
1715 ELEVATORS_MODE LIVINGAPARTMENTS_AVG 0.793822
3347 AMT_ANNUITY AMT_GOODS_PRICE 0.775213
3531 AMT_CREDIT AMT_ANNUITY 0.770161
2449 FLOORSMAX_AVG FLOORSMIN_AVG 0.742997
2390 FLOORSMAX_MEDI FLOORSMIN_MEDI 0.741033
2451 FLOORSMAX_AVG FLOORSMIN_MEDI 0.740569
2388 FLOORSMAX_MEDI FLOORSMIN_AVG 0.740496
2329 FLOORSMAX_MODE FLOORSMIN_MEDI 0.730576
2327 FLOORSMAX_MODE FLOORSMIN_AVG 0.729838
2328 FLOORSMAX_MODE FLOORSMIN_MODE 0.726738
2389 FLOORSMAX_MEDI FLOORSMIN_MODE 0.723336
2450 FLOORSMAX_AVG FLOORSMIN_MODE 0.722703

Vemos como las correlaciones suelen encontrarse en las mismas variables expuestas de diferentes forma como puede ser OBS_60_CNT_SOCIAL_CIRCLE y OBS_30_CNT_SOCIAL_CIRCLE siendo que estas observan cuántas observaciones del entorno social del cliente con morosidad se observaron 60 y 30 días de retraso, es decir, es posible que aquellas variables con nombre similar y correlacionadas expliquen lo mismo con diferente magnitud (más o menos días, promedio o mediana...)

Comprobamos que existen valores nulos para las variables continuas

In [25]:
f_aux.get_percent_null_values_target(pd_loan_train, list_var_cont, target='TARGET')
Out[25]:
0.0 1.0 variable sum_null_values porcentaje_sum_null_values
0 0.000000 1.000000 COMMONAREA_MEDI 171972 0.699050
1 0.914387 0.085613 COMMONAREA_MEDI 171972 0.699050
2 0.000000 1.000000 COMMONAREA_AVG 171972 0.699050
3 0.914387 0.085613 COMMONAREA_AVG 171972 0.699050
4 0.000000 1.000000 COMMONAREA_MODE 171972 0.699050
5 0.914387 0.085613 COMMONAREA_MODE 171972 0.699050
6 0.000000 1.000000 NONLIVINGAPARTMENTS_MODE 170909 0.694729
7 0.914270 0.085730 NONLIVINGAPARTMENTS_MODE 170909 0.694729
8 0.000000 1.000000 NONLIVINGAPARTMENTS_AVG 170909 0.694729
9 0.914270 0.085730 NONLIVINGAPARTMENTS_AVG 170909 0.694729
10 0.000000 1.000000 NONLIVINGAPARTMENTS_MEDI 170909 0.694729
11 0.914270 0.085730 NONLIVINGAPARTMENTS_MEDI 170909 0.694729
12 0.000000 1.000000 LIVINGAPARTMENTS_MODE 168277 0.684031
13 0.913850 0.086150 LIVINGAPARTMENTS_MODE 168277 0.684031
14 0.000000 1.000000 LIVINGAPARTMENTS_AVG 168277 0.684031
15 0.913850 0.086150 LIVINGAPARTMENTS_AVG 168277 0.684031
16 0.000000 1.000000 LIVINGAPARTMENTS_MEDI 168277 0.684031
17 0.913850 0.086150 LIVINGAPARTMENTS_MEDI 168277 0.684031
18 0.000000 1.000000 FLOORSMIN_AVG 166993 0.678811
19 0.913829 0.086171 FLOORSMIN_AVG 166993 0.678811
20 0.000000 1.000000 FLOORSMIN_MODE 166993 0.678811
21 0.913829 0.086171 FLOORSMIN_MODE 166993 0.678811
22 0.000000 1.000000 FLOORSMIN_MEDI 166993 0.678811
23 0.913829 0.086171 FLOORSMIN_MEDI 166993 0.678811
24 0.000000 1.000000 YEARS_BUILD_MEDI 163682 0.665352
25 0.913320 0.086680 YEARS_BUILD_MEDI 163682 0.665352
26 0.000000 1.000000 YEARS_BUILD_MODE 163682 0.665352
27 0.913320 0.086680 YEARS_BUILD_MODE 163682 0.665352
28 0.000000 1.000000 YEARS_BUILD_AVG 163682 0.665352
29 0.913320 0.086680 YEARS_BUILD_AVG 163682 0.665352
30 0.000000 1.000000 OWN_CAR_AGE 162223 0.659422
31 0.914716 0.085284 OWN_CAR_AGE 162223 0.659422
32 0.000000 1.000000 LANDAREA_MEDI 146053 0.593692
33 0.912059 0.087941 LANDAREA_MEDI 146053 0.593692
34 0.000000 1.000000 LANDAREA_MODE 146053 0.593692
35 0.912059 0.087941 LANDAREA_MODE 146053 0.593692
36 0.000000 1.000000 LANDAREA_AVG 146053 0.593692
37 0.912059 0.087941 LANDAREA_AVG 146053 0.593692
38 0.000000 1.000000 BASEMENTAREA_MEDI 143994 0.585322
39 0.911114 0.088886 BASEMENTAREA_MEDI 143994 0.585322
40 0.000000 1.000000 BASEMENTAREA_AVG 143994 0.585322
41 0.911114 0.088886 BASEMENTAREA_AVG 143994 0.585322
42 0.000000 1.000000 BASEMENTAREA_MODE 143994 0.585322
43 0.911114 0.088886 BASEMENTAREA_MODE 143994 0.585322
44 0.000000 1.000000 EXT_SOURCE_1 138885 0.564555
45 0.914764 0.085236 EXT_SOURCE_1 138885 0.564555
46 0.000000 1.000000 NONLIVINGAREA_MODE 135759 0.551848
47 0.909840 0.090160 NONLIVINGAREA_MODE 135759 0.551848
48 0.000000 1.000000 NONLIVINGAREA_AVG 135759 0.551848
49 0.909840 0.090160 NONLIVINGAREA_AVG 135759 0.551848
50 0.000000 1.000000 NONLIVINGAREA_MEDI 135759 0.551848
51 0.909840 0.090160 NONLIVINGAREA_MEDI 135759 0.551848
52 0.000000 1.000000 ELEVATORS_MEDI 131138 0.533064
53 0.909271 0.090729 ELEVATORS_MEDI 131138 0.533064
54 0.000000 1.000000 ELEVATORS_AVG 131138 0.533064
55 0.909271 0.090729 ELEVATORS_AVG 131138 0.533064
56 0.000000 1.000000 ELEVATORS_MODE 131138 0.533064
57 0.909271 0.090729 ELEVATORS_MODE 131138 0.533064
58 0.000000 1.000000 APARTMENTS_MEDI 124856 0.507528
59 0.908583 0.091417 APARTMENTS_MEDI 124856 0.507528
60 0.000000 1.000000 APARTMENTS_AVG 124856 0.507528
61 0.908583 0.091417 APARTMENTS_AVG 124856 0.507528
62 0.000000 1.000000 APARTMENTS_MODE 124856 0.507528
63 0.908583 0.091417 APARTMENTS_MODE 124856 0.507528
64 0.000000 1.000000 ENTRANCES_MEDI 123897 0.503630
65 0.908359 0.091641 ENTRANCES_MEDI 123897 0.503630
66 0.000000 1.000000 ENTRANCES_AVG 123897 0.503630
67 0.908359 0.091641 ENTRANCES_AVG 123897 0.503630
68 0.000000 1.000000 ENTRANCES_MODE 123897 0.503630
69 0.908359 0.091641 ENTRANCES_MODE 123897 0.503630
70 0.000000 1.000000 LIVINGAREA_AVG 123502 0.502024
71 0.908690 0.091310 LIVINGAREA_AVG 123502 0.502024
72 0.000000 1.000000 LIVINGAREA_MODE 123502 0.502024
73 0.908690 0.091310 LIVINGAREA_MODE 123502 0.502024
74 0.000000 1.000000 LIVINGAREA_MEDI 123502 0.502024
75 0.908690 0.091310 LIVINGAREA_MEDI 123502 0.502024
76 0.000000 1.000000 FLOORSMAX_MODE 122449 0.497744
77 0.908248 0.091752 FLOORSMAX_MODE 122449 0.497744
78 0.000000 1.000000 FLOORSMAX_MEDI 122449 0.497744
79 0.908248 0.091752 FLOORSMAX_MEDI 122449 0.497744
80 0.000000 1.000000 FLOORSMAX_AVG 122449 0.497744
81 0.908248 0.091752 FLOORSMAX_AVG 122449 0.497744
82 0.000000 1.000000 YEARS_BEGINEXPLUATATION_MODE 120063 0.488045
83 0.908048 0.091952 YEARS_BEGINEXPLUATATION_MODE 120063 0.488045
84 0.000000 1.000000 YEARS_BEGINEXPLUATATION_MEDI 120063 0.488045
85 0.908048 0.091952 YEARS_BEGINEXPLUATATION_MEDI 120063 0.488045
86 0.000000 1.000000 YEARS_BEGINEXPLUATATION_AVG 120063 0.488045
87 0.908048 0.091952 YEARS_BEGINEXPLUATATION_AVG 120063 0.488045
88 0.000000 1.000000 TOTALAREA_MODE 118781 0.482834
89 0.907780 0.092220 TOTALAREA_MODE 118781 0.482834
90 0.000000 1.000000 EXT_SOURCE_3 48916 0.198839
91 0.907658 0.092342 EXT_SOURCE_3 48916 0.198839
92 0.000000 1.000000 AMT_REQ_CREDIT_BUREAU_MON 33287 0.135309
93 0.896777 0.103223 AMT_REQ_CREDIT_BUREAU_MON 33287 0.135309
94 0.000000 1.000000 AMT_REQ_CREDIT_BUREAU_QRT 33287 0.135309
95 0.896777 0.103223 AMT_REQ_CREDIT_BUREAU_QRT 33287 0.135309
96 0.000000 1.000000 AMT_REQ_CREDIT_BUREAU_YEAR 33287 0.135309
97 0.896777 0.103223 AMT_REQ_CREDIT_BUREAU_YEAR 33287 0.135309
98 0.000000 1.000000 OBS_30_CNT_SOCIAL_CIRCLE 819 0.003329
99 0.960928 0.039072 OBS_30_CNT_SOCIAL_CIRCLE 819 0.003329
100 0.000000 1.000000 DEF_30_CNT_SOCIAL_CIRCLE 819 0.003329
101 0.960928 0.039072 DEF_30_CNT_SOCIAL_CIRCLE 819 0.003329
102 0.000000 1.000000 OBS_60_CNT_SOCIAL_CIRCLE 819 0.003329
103 0.960928 0.039072 OBS_60_CNT_SOCIAL_CIRCLE 819 0.003329
104 0.000000 1.000000 EXT_SOURCE_2 543 0.002207
105 0.911602 0.088398 EXT_SOURCE_2 543 0.002207
106 0.000000 1.000000 AMT_GOODS_PRICE 217 0.000882
107 0.921659 0.078341 AMT_GOODS_PRICE 217 0.000882
108 0.000000 1.000000 AMT_ANNUITY 9 0.000037
109 1.000000 0.000000 AMT_ANNUITY 9 0.000037
110 0.000000 1.000000 CNT_FAM_MEMBERS 2 0.000008
111 1.000000 0.000000 CNT_FAM_MEMBERS 2 0.000008
112 0.000000 1.000000 DAYS_LAST_PHONE_CHANGE 1 0.000004
113 1.000000 0.000000 DAYS_LAST_PHONE_CHANGE 1 0.000004

Vamos a cambiar los valores nulos de las columnas continuas por la mediana de los valores no nulos de la columna, se ha elegido la mediana ya que a diferencia de la media (promedio), la mediana no se ve afectada por valores atípicos o extremos en los datos. Al tener un dataset con variables con valores muy extremos (como se puede observar en los gráficos) la mediana es una mejor medida de tendencia central porque representa el punto medio de la distribución.

In [ ]:
train_median = pd_loan_train[list_var_cont].median()
pd_loan_train[list_var_cont] = pd_loan_train[list_var_cont].fillna(train_median)
pd_loan_test[list_var_cont] = pd_loan_test[list_var_cont].fillna(train_median)

Comprobamos que no existen valores nulos tras la transformación ya que estos han sido sustituidos por la mediana de la variable.

In [27]:
f_aux.get_percent_null_values_target(pd_loan_train, list_var_cont, target='TARGET')
No existen variables con valores nulos
Out[27]:

Tratamiento de variables CATEGÓRICAS

In [28]:
list_var_cat
Out[28]:
['FONDKAPREMONT_MODE',
 'WALLSMATERIAL_MODE',
 'HOUSETYPE_MODE',
 'EMERGENCYSTATE_MODE',
 'OCCUPATION_TYPE',
 'NAME_TYPE_SUITE',
 'CNT_CHILDREN',
 'FLAG_DOCUMENT_8',
 'NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_DOCUMENT_2',
 'FLAG_DOCUMENT_3',
 'FLAG_DOCUMENT_4',
 'FLAG_DOCUMENT_5',
 'FLAG_DOCUMENT_6',
 'FLAG_DOCUMENT_7',
 'FLAG_DOCUMENT_9',
 'FLAG_DOCUMENT_21',
 'FLAG_DOCUMENT_10',
 'FLAG_DOCUMENT_11',
 'FLAG_OWN_REALTY',
 'FLAG_DOCUMENT_13',
 'FLAG_DOCUMENT_14',
 'FLAG_DOCUMENT_15',
 'FLAG_DOCUMENT_16',
 'FLAG_DOCUMENT_17',
 'FLAG_DOCUMENT_18',
 'FLAG_DOCUMENT_19',
 'FLAG_DOCUMENT_20',
 'FLAG_DOCUMENT_12',
 'FLAG_PHONE',
 'LIVE_CITY_NOT_WORK_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'TARGET',
 'REG_CITY_NOT_LIVE_CITY',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'REG_REGION_NOT_LIVE_REGION',
 'HOUR_APPR_PROCESS_START',
 'WEEKDAY_APPR_PROCESS_START',
 'REGION_RATING_CLIENT_W_CITY',
 'REGION_RATING_CLIENT',
 'FLAG_EMAIL',
 'FLAG_CONT_MOBILE',
 'ORGANIZATION_TYPE',
 'FLAG_WORK_PHONE',
 'FLAG_EMP_PHONE',
 'FLAG_MOBIL',
 'NAME_HOUSING_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_EDUCATION_TYPE',
 'NAME_INCOME_TYPE']

Ahora se va a ver si existe correlación entre algunas de las variables categóricas que puedan ser importantes para el análisis de la dificultad frente a los pagos. He creado un bucle adicional que me devuelva cada valor de la V de Cramer para cada variable categórica y que se ordene de mayor relación a menor.

In [ ]:
def cramers_v(matrix):
    chi2, p, dof, ex = chi2_contingency(matrix)  # Chi-squared test
    return np.sqrt(chi2 / (matrix.sum().sum() * (min(matrix.shape) - 1)))

cramers_v_results = {}

for col in list_var_cat:
    # We calculate the confusion matrix between the categorical variable and TARGET.
    confusion_matrix = pd.crosstab(pd_loan_train["TARGET"], pd_loan_train[col])
    
    # We calculate the Cramér's V for this matrix
    cramers_v_value = cramers_v(confusion_matrix.values)
    
    # We save the result in the dictionary
    cramers_v_results[col] = cramers_v_value

cramers_v_df = pd.DataFrame(list(cramers_v_results.items()), columns=['Variable', 'Cramér\'s V'])
cramers_v_df = cramers_v_df.sort_values(by='Cramér\'s V', ascending=False)

print(cramers_v_df)
                       Variable  Cramér's V
34                       TARGET    0.999973
4               OCCUPATION_TYPE    0.080065
45            ORGANIZATION_TYPE    0.074128
52             NAME_INCOME_TYPE    0.064792
41  REGION_RATING_CLIENT_W_CITY    0.061046
42         REGION_RATING_CLIENT    0.059173
51          NAME_EDUCATION_TYPE    0.056335
9                   CODE_GENDER    0.055191
33       REG_CITY_NOT_WORK_CITY    0.050677
47               FLAG_EMP_PHONE    0.046979
12              FLAG_DOCUMENT_3    0.044897
35       REG_CITY_NOT_LIVE_CITY    0.043686
50           NAME_FAMILY_STATUS    0.039488
49            NAME_HOUSING_TYPE    0.036335
32      LIVE_CITY_NOT_WORK_CITY    0.032358
8            NAME_CONTRACT_TYPE    0.031104
39      HOUR_APPR_PROCESS_START    0.029447
15              FLAG_DOCUMENT_6    0.029307
46              FLAG_WORK_PHONE    0.028350
1            WALLSMATERIAL_MODE    0.027483
6                  CNT_CHILDREN    0.023839
31                   FLAG_PHONE    0.023379
10                 FLAG_OWN_CAR    0.023257
0            FONDKAPREMONT_MODE    0.013820
25             FLAG_DOCUMENT_16    0.012795
3           EMERGENCYSTATE_MODE    0.011982
22             FLAG_DOCUMENT_13    0.011592
5               NAME_TYPE_SUITE    0.011143
2                HOUSETYPE_MODE    0.010926
23             FLAG_DOCUMENT_14    0.009716
7               FLAG_DOCUMENT_8    0.008502
40   WEEKDAY_APPR_PROCESS_START    0.008191
27             FLAG_DOCUMENT_18    0.007826
37   REG_REGION_NOT_WORK_REGION    0.006667
24             FLAG_DOCUMENT_15    0.006508
38   REG_REGION_NOT_LIVE_REGION    0.005940
21              FLAG_OWN_REALTY    0.005210
11              FLAG_DOCUMENT_2    0.003597
20             FLAG_DOCUMENT_11    0.003368
17              FLAG_DOCUMENT_9    0.003245
43                   FLAG_EMAIL    0.003182
26             FLAG_DOCUMENT_17    0.002522
36  LIVE_REGION_NOT_WORK_REGION    0.002343
16              FLAG_DOCUMENT_7    0.002290
28             FLAG_DOCUMENT_19    0.002014
18             FLAG_DOCUMENT_21    0.001927
13              FLAG_DOCUMENT_4    0.001845
29             FLAG_DOCUMENT_20    0.001004
44             FLAG_CONT_MOBILE    0.000883
14              FLAG_DOCUMENT_5    0.000258
19             FLAG_DOCUMENT_10    0.000000
48                   FLAG_MOBIL    0.000000
30             FLAG_DOCUMENT_12    0.000000

Observamos como las variables categóricas, salvo la propia 'TARGET', tienen todas un valor de la V de Cramers muy bajo , todas ellas inferiores a 0.1, esto significa que las variables categóricas tienen una asociación muy débil con la variable 'TARGET'. Dado que las variables categóricas no muestran una correlación fuerte con 'TARGET', podríamos considerar reducir su peso en el modelo o explorar otras características que podrían tener una mayor influencia. Esto indica que es posible que la variable TARGET dependa más de variables numéricas o continuas que de las variables categóricas. Por lo que sería interesante investigar la correlación de TARGET con las variables continuas y explorar la importancia de estas características para predecir TARGET."

In [ ]:
pd_loan_train[list_var_cat] = pd_loan_train[list_var_cat].astype("object").fillna("VALUELESS").astype("category")
pd_loan_test[list_var_cat] = pd_loan_test[list_var_cat].astype("object").fillna("VALUELESS").astype("category")
In [31]:
pd_loan_train['AMT_REQ_CREDIT_BUREAU_DAY'] = pd_loan_train['AMT_REQ_CREDIT_BUREAU_DAY'].fillna(0)
pd_loan_train['AMT_REQ_CREDIT_BUREAU_WEEK'] = pd_loan_train['AMT_REQ_CREDIT_BUREAU_WEEK'].fillna(0)
pd_loan_train['AMT_REQ_CREDIT_BUREAU_HOUR'] = pd_loan_train['AMT_REQ_CREDIT_BUREAU_HOUR'].fillna(0)
pd_loan_train['DEF_60_CNT_SOCIAL_CIRCLE'] = pd_loan_train['DEF_60_CNT_SOCIAL_CIRCLE'].fillna(0)
pd_loan_test['AMT_REQ_CREDIT_BUREAU_DAY'] = pd_loan_test['AMT_REQ_CREDIT_BUREAU_DAY'].fillna(0)
pd_loan_test['AMT_REQ_CREDIT_BUREAU_WEEK'] = pd_loan_test['AMT_REQ_CREDIT_BUREAU_WEEK'].fillna(0)
pd_loan_test['AMT_REQ_CREDIT_BUREAU_HOUR'] = pd_loan_test['AMT_REQ_CREDIT_BUREAU_HOUR'].fillna(0)
pd_loan_test['DEF_60_CNT_SOCIAL_CIRCLE'] = pd_loan_test['DEF_60_CNT_SOCIAL_CIRCLE'].fillna(0)
In [32]:
pd_loan_train.isna().sum().sum()
Out[32]:
0

A través del código anterior lo que se hace es transformar las columnas categóricas a objecto, un formato de variable más 'flexible', rellena los NaN con la cade de texto 'SIN VALOR' y despues las vuelve a transformar a category.

In [33]:
pd_loan_train.to_csv("../data/train_pd_data_preprocessing_missing_outlier.csv")
pd_loan_test.to_csv("../data/test_pd_data_preprocessing_missing_outlier.csv")

Volvemos a guardar los datasets de train y test que hemos utilizado y manipulado en formato .csv para seguir podiendo utilizarlos en el siguiente notebook